###########################################
#import

source('sosci_import_computational_journalism_2015-01-29.r')
se <- function(x, na.rm=F) { sd(x, na.rm)/sqrt(length(x)) }

###########################################
#cleaning data

#count entries from first SoSci panel email
nrow(dfData[dfData$STARTED >= as.POSIXct('2014-12-05'),])
#degrade from sosci
dfData <- dfData[dfData$DEGRADE < 100,]
#only if second stimulus was answered
dfData <- dfData[dfData$LASTPAGE >= 6,]
#age
dfData <- dfData[dfData$SD02_01 > 12 & dfData$SD02_01 < 99,]
#dependent variable not completely NA
dfData <- dfData[!!rowSums(!is.na(dfData[,c(sprintf('EX01_%0.2d', 1:12), sprintf('EX02_%0.2d', 1:12))])),]


###########################################
#socio demographics

#n
nrow(dfData)
#gender (female = 1, male = 2)
table(dfData$SD01)/nrow(dfData)
#age
mean(dfData$SD02_01, na.rm=T)
#date range
min(dfData$STARTED)
max(dfData$STARTED)
#response time
mean(dfData$TIME_SUM, na.rm=T)/60
sd(dfData$TIME_SUM, na.rm=T)/60


###########################################
#recode into case-wise df (instead of person-wise)

#duplicate
dfData$reihenfolge <- 1
dfData$thema <- ifelse(dfData$UV04_01 == 1, 'Fussball', 'Finanzen')
dfData$aq <- sapply(dfData$UV04_02, function(x) {
  switch(x,
    a = 'J',
    b = 'R',
    c = 'R',
    d = 'J'
  )
})
dfData$tq <- sapply(dfData$UV04_02, function(x) {
  switch(x,
    a = 'J',
    b = 'J',
    c = 'R',
    d = 'R'
  )
})

dfDataTemp <- dfData
dfDataTemp$reihenfolge <- 2
dfDataTemp$thema <- ifelse(dfDataTemp$thema == 'Fussball', 'Finanzen', 'Fussball')
dfDataTemp$aq <- sapply(dfDataTemp$UV04_03, function(x) {
  switch(x,
    a = 'J',
    b = 'R',
    c = 'R',
    d = 'J'
  )
})
dfDataTemp$tq <- sapply(dfDataTemp$UV04_03, function(x) {
  switch(x,
    a = 'J',
    b = 'J',
    c = 'R',
    d = 'R'
  )
})

dfData <- rbind(dfData, dfDataTemp)
rm(dfDataTemp)

#reformatting
dfData$thema <- as.factor(dfData$thema)
dfData$reihenfolge <- as.factor(dfData$reihenfolge)
dfData$aq <- as.factor(dfData$aq)
dfData$tq <- as.factor(dfData$tq)
dfData$num_aq <- as.numeric(dfData$aq)
dfData$num_tq <- ifelse(dfData$tq == 'J', 3, 4)
dfData$num_thema <- ifelse(dfData$thema == 'Finanzen', 2, 1)
dfData$num_aqxtq <- dfData$num_aq * dfData$num_tq

#group creation
dfData$group <- paste(dfData$thema, paste0('aq', dfData$aq), paste0('tq', dfData$tq))

#scale creation
for(i in 1:12) {
  dfData[, paste0('qual_', i)] <- ifelse(dfData$thema == 'Fussball', dfData[, sprintf('EX01_%0.2d', i)], dfData[, sprintf('EX02_%0.2d', i)])
}

#remove original scale columns in order to reduce misinterpretation potential
dfData <- dfData[, !(colnames(dfData) %in% c(sprintf('EX01_%0.2d', 1:12), sprintf('EX02_%0.2d', 1:12)))]

###########################################
#scale validation
library(psy)
cronbach(dfData[, c('qual_1', 'qual_2', 'qual_3', 'qual_4')])
cronbach(dfData[, c('qual_5', 'qual_6', 'qual_7', 'qual_8')])
cronbach(dfData[, c('qual_9', 'qual_10', 'qual_11', 'qual_12')])


###########################################
#index creation
dfData$cred <- rowMeans(dfData[, c('qual_1', 'qual_2', 'qual_3', 'qual_4')], na.rm=T)
dfData$like <- rowMeans(dfData[, c('qual_5', 'qual_6', 'qual_7', 'qual_8')], na.rm=T)
dfData$expe <- rowMeans(dfData[, c('qual_9', 'qual_10', 'qual_11', 'qual_12')], na.rm=T)


###########################################
#validity checks
dfData$group <- as.factor(dfData$group)
table(dfData[, c('group', 'SD01')])/nrow(dfData) #gender
summary(aov(dfData$SD02_01 ~ dfData$group)) #age
table(dfData[, c('group', 'KV01_01')])/nrow(dfData) #journalistic experience
#media usage
summary(aov(dfData$MN01_01 ~ dfData$group))
summary(aov(dfData$MN01_02 ~ dfData$group))
summary(aov(dfData$MN01_03 ~ dfData$group))
summary(aov(dfData$MN01_04 ~ dfData$group))
summary(aov(dfData$MN01_05 ~ dfData$group))
summary(aov(dfData$MN01_06 ~ dfData$group))
#topic interest
summary(aov(dfData$MN02_07 ~ dfData$group))
summary(aov(dfData$MN02_08 ~ dfData$group))



###########################################
#main descriptives

#n
nrow(dfData[dfData$thema == 'Fussball' & !is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),])
nrow(dfData[dfData$thema == 'Finanzen' & !is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),])

#crosstabs
with(dfData, {
  print('Credibility')
  print(aggregate(cred, list(group), mean, na.rm=T))
  print(aggregate(cred, list(group), se, na.rm=T))
  print(table(!is.na(cred), group))
  
  print('Likability')
  print(aggregate(like, list(group), mean, na.rm=T))
  print(aggregate(like, list(group), se, na.rm=T))
  print(table(!is.na(like), group))
  
  print('Expertise')
  print(aggregate(expe, list(group), mean, na.rm=T))
  print(aggregate(expe, list(group), se, na.rm=T))
  table(!is.na(expe), group)
})

#rowwise comparisons
for(i in c('Fussball', 'Finanzen')) {
  print(paste('thema', i, "nur TQ-Unterschiede"))
  with(dfData[dfData$thema == i,], {
    print('Credibility')
    print(aggregate(cred, list(tq), mean, na.rm=T))
    print(aggregate(cred, list(tq), sd, na.rm=T))
    print(table(!is.na(cred), tq))
    
    print('Likability')
    print(aggregate(like, list(tq), mean, na.rm=T))
    print(aggregate(like, list(tq), sd, na.rm=T))
    print(table(!is.na(like), tq))
    
    print('Expertise')
    print(aggregate(expe, list(tq), mean, na.rm=T))
    print(aggregate(expe, list(tq), sd, na.rm=T))
    table(!is.na(expe), tq)
  })
}

#colwise comparisons
for(i in c('Fussball', 'Finanzen')) {
  print(paste('thema', i, "nur AQ-Unterschiede"))
  with(dfData[dfData$thema == i,], {
    print('Credibility')
    print(aggregate(cred, list(aq), mean, na.rm=T))
    print(aggregate(cred, list(aq), sd, na.rm=T))
    print(table(!is.na(cred), aq))
    
    print('Likability')
    print(aggregate(like, list(aq), mean, na.rm=T))
    print(aggregate(like, list(aq), sd, na.rm=T))
    print(table(!is.na(like), aq))
    
    print('Expertise')
    print(aggregate(expe, list(aq), mean, na.rm=T))
    print(aggregate(expe, list(aq), sd, na.rm=T))
    table(!is.na(expe), aq)
  })
}


###########################################
#visualization

library(ggplot2)
#bar plot for qualities
plotMainEffects <- function(df, title='') {
  dfDataBarPlot <- with(df, rbind(
    data.frame(
      aq = aq,
      tq = tq,
      quality = cred,
      type = 'Credibility'
    ), data.frame(
      aq = aq,
      tq = tq,
      quality = expe,
      type = 'Expertise'
    ), data.frame(
      aq = aq,
      tq = tq,
      quality = like,
      type = 'Liking'
    )
  ))
  levels(dfDataBarPlot$aq) <- c('Journalist', 'Algorithm')
  levels(dfDataBarPlot$tq) <- c('Human-\nWritten', 'Computer-\nGenerated')
  gPlot <- ggplot(dfDataBarPlot, aes(tq, quality, fill=aq)) + 
    coord_cartesian(ylim=c(1, 5)) +
    stat_summary(fun.y=mean, geom='bar', position='dodge') + 
    stat_summary(fun.data=function(x) {
      xM <- mean(x, na.rm=T)
      xSE <- se(x, na.rm=T)
      return(data.frame(
        y = xM,
        ymin = xM - (1.96*xSE),
        ymax = xM + (1.96*xSE)
      ))
    }, geom='errorbar', position=position_dodge(width=.9), width=.2) + 
    facet_wrap(~ type) +
    labs(x='Actual Source', y='Content Perception [5-point scale]', fill='Declared Source') +
    theme_bw(16, 'Arial') +
    scale_fill_manual(values=c("#CCCCCC", "#333333"))
  if(title != '') {
    gPlot <- gPlot + ggtitle(title)
  }
  print(gPlot)
}
plotMainEffects(dfData[dfData$thema == 'Fussball',], 'Fussball')
plotMainEffects(dfData[dfData$thema == 'Finanzen',], 'Finanzen')
plotMainEffects(dfData)

#descriptive data for that
with(dfData[dfData$aq == 'J' & dfData$tq == 'J' & !is.na(dfData$cred),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(cred, na.rm=T), se(cred, na.rm=T), length(cred)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'J' & !is.na(dfData$cred),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(cred, na.rm=T), se(cred, na.rm=T), length(cred)))
with(dfData[dfData$aq == 'J' & dfData$tq == 'R' & !is.na(dfData$cred),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(cred, na.rm=T), se(cred, na.rm=T), length(cred)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'R' & !is.na(dfData$cred),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(cred, na.rm=T), se(cred, na.rm=T), length(cred)))

with(dfData[dfData$aq == 'J' & dfData$tq == 'J' & !is.na(dfData$expe),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(expe, na.rm=T), se(expe, na.rm=T), length(expe)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'J' & !is.na(dfData$expe),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(expe, na.rm=T), se(expe, na.rm=T), length(expe)))
with(dfData[dfData$aq == 'J' & dfData$tq == 'R' & !is.na(dfData$expe),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(expe, na.rm=T), se(expe, na.rm=T), length(expe)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'R' & !is.na(dfData$expe),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(expe, na.rm=T), se(expe, na.rm=T), length(expe)))

with(dfData[dfData$aq == 'J' & dfData$tq == 'J' & !is.na(dfData$like),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(like, na.rm=T), se(like, na.rm=T), length(like)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'J' & !is.na(dfData$like),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(like, na.rm=T), se(like, na.rm=T), length(like)))
with(dfData[dfData$aq == 'J' & dfData$tq == 'R' & !is.na(dfData$like),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(like, na.rm=T), se(like, na.rm=T), length(like)))
with(dfData[dfData$aq == 'R' & dfData$tq == 'R' & !is.na(dfData$like),], 
  sprintf('M = %.1f; SE = %.2f; n = %d', mean(like, na.rm=T), se(like, na.rm=T), length(like)))



###########################################
#(m)anova

#scatter plots for relationships
ggplot(dfData, aes(cred, like)) + geom_point() + geom_smooth(method='lm') + labs(x='credibility', y='readability') + facet_wrap(~ group)
ggplot(dfData, aes(cred, expe)) + geom_point() + geom_smooth(method='lm') + labs(x='credibility', y='journal. exertise') + facet_wrap(~ group)
ggplot(dfData, aes(like, expe)) + geom_point() + geom_smooth(method='lm') + labs(x='readability', y='journal. exertise') + facet_wrap(~ group)
#homogeneity of var-cov matrices
library(pastecs)
with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], print(by(data.frame(cred, like, expe), group, cov)))
#multivariate normality (should NOT be significant)
library(mvnormtest)
with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe) & dfData$aq == 'J' & dfData$tq == 'J',], {
  mshapiro.test(t(data.frame(cred, like, expe)))
})
with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe) & dfData$aq == 'R' & dfData$tq == 'J',], {
  mshapiro.test(t(data.frame(cred, like, expe)))
})
with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe) & dfData$aq == 'J' & dfData$tq == 'R',], {
  mshapiro.test(t(data.frame(cred, like, expe)))
})
with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe) & dfData$aq == 'R' & dfData$tq == 'R',], {
  mshapiro.test(t(data.frame(cred, like, expe)))
})
#multivariate normality as visual test
library(mvoutlier)
aq.plot(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe), c('cred', 'like', 'expe')])
#==> ergo: too many outliers, therefore using a robust MANOVA

#estimate the model
dfDepVar <- with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], cbind(cred, like, expe))
mAnova <- manova(dfDepVar ~ aq*tq, data=dfData, na.action=na.exclude)
summary(mAnova, intercept=T)
summary.aov(mAnova)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(cred ~ aq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(cred ~ tq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(cred ~ aq*tq, na.action=na.exclude)))$adj.r.squared)

sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(like ~ aq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(like ~ tq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(like ~ aq*tq, na.action=na.exclude)))$adj.r.squared)

sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(expe ~ aq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(expe ~ tq, na.action=na.exclude)))$adj.r.squared)
sqrt(with(dfData[!is.na(dfData$cred) & !is.na(dfData$like) & !is.na(dfData$expe),], summary.lm(aov(expe ~ aq*tq, na.action=na.exclude)))$adj.r.squared)

nrow(dfDepVar)


###########################################
#influences

#order
with(dfData, aggregate(cred, list(reihenfolge), mean, na.rm=T))
t.test(dfData[dfData$reihenfolge == 1, 'cred'], dfData[dfData$reihenfolge == 2, 'cred'], paired=T, na.action=na.omit)

with(dfData, aggregate(like, list(reihenfolge), mean, na.rm=T))
t.test(dfData[dfData$reihenfolge == 1, 'like'], dfData[dfData$reihenfolge == 2, 'like'], paired=T, na.action=na.omit)

with(dfData, aggregate(expe, list(reihenfolge), mean, na.rm=T))
t.test(dfData[dfData$reihenfolge == 1, 'expe'], dfData[dfData$reihenfolge == 2, 'expe'], paired=T, na.action=na.omit)

#involvement
library(ggm)
with(dfData[dfData$thema == 'Fußball',], {
  #sport
  print(cor.test(MN02_08, cred))
  print(cor.test(MN02_08, like))
  print(cor.test(MN02_08, expe))
})

with(dfData[dfData$thema == 'Finanzen',], {
  #wirtschaft
  print(cor.test(MN02_04, cred))
  print(cor.test(MN02_04, like))
  print(cor.test(MN02_04, expe))
})

with(dfData, dfDataTemp <<- data.frame(
  thema = as.numeric(thema),
  cred = ifelse(is.nan(cred), NA, cred),
  like = ifelse(is.nan(like), NA, like),
  expe = ifelse(is.nan(expe), NA, expe),
  reihenfolge = as.numeric(reihenfolge),
  wirtschaft = MN02_04,
  sport = MN02_08
))
pcor(c('sport', 'cred', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 1,], na.rm=T))
pcor(c('sport', 'like', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 1,], na.rm=T))
pcor(c('sport', 'expe', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 1,], na.rm=T))

pcor(c('wirtschaft', 'cred', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 2,], na.rm=T))
pcor(c('wirtschaft', 'like', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 2,], na.rm=T))
pcor(c('wirtschaft', 'expe', 'reihenfolge'), var(dfDataTemp[dfDataTemp$thema == 2,], na.rm=T))



###########################################
#variations

#journalists v. non-journalists
nrow(dfData[dfData$thema == 'Fussball' & dfData$KV01_01 == T & !is.na(dfData$cred) & !is.na(dfData$expe) & !is.na(dfData$like),])
nrow(dfData[dfData$thema == 'Finanzen' & dfData$KV01_01 == T & !is.na(dfData$cred) & !is.na(dfData$expe) & !is.na(dfData$like),])
for(b in c(T, F)) {
  print(ifelse(b == T, 'Journalists', 'non-journalists'))
  with(dfData[dfData$KV01_01 == b,], {
    print('Credibility')
    print(aggregate(cred, list(num_aqxtq), mean, na.rm=T))
    print(aggregate(cred, list(num_aqxtq), se, na.rm=T))
    print(table(!is.na(cred), num_aqxtq))
    
    print('Likability')
    print(aggregate(like, list(num_aqxtq), mean, na.rm=T))
    print(aggregate(like, list(num_aqxtq), se, na.rm=T))
    print(table(!is.na(like), num_aqxtq))
    
    print('Expertise')
    print(aggregate(expe, list(num_aqxtq), mean, na.rm=T))
    print(aggregate(expe, list(num_aqxtq), se, na.rm=T))
    table(!is.na(expe), num_aqxtq)
  })
}
